{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import os\n",
    "from gensim.models import Word2Vec\n",
    "from tensorflow.keras.layers import (Bidirectional,\n",
    "                                     Embedding,\n",
    "                                     GRU, \n",
    "                                     GlobalAveragePooling1D,\n",
    "                                     GlobalMaxPooling1D,\n",
    "                                     Concatenate,\n",
    "                                     SpatialDropout1D,\n",
    "                                     BatchNormalization,\n",
    "                                     Dropout,\n",
    "                                     Dense,\n",
    "                                     Activation,\n",
    "                                     concatenate,\n",
    "                                     Input\n",
    "                                    )\n",
    "                        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取数据集\n",
    "train = pd.read_csv('../data/train_set.csv')\n",
    "test = pd.read_csv('../data/test_set.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据准备"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、 Text to Sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000, \n",
    "                                                  lower=False,filters=\"\")\n",
    "tokenizer.fit_on_texts(train['word_seg'].tolist()+test['word_seg'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ = tokenizer.texts_to_sequences(train['word_seg'].values)\n",
    "test_ = tokenizer.texts_to_sequences(test['word_seg'].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2、数据统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1822.199999999997"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.percentile(list(map(lambda x: len(x),train_)),95)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3、数据截断、补全"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ = tf.keras.preprocessing.sequence.pad_sequences(train_, maxlen=1800,\n",
    "                                                      padding='pre',truncating='pre',value=0.0)\n",
    "test_ = tf.keras.preprocessing.sequence.pad_sequences(test_, maxlen=1800,\n",
    "                                                     padding='pre',truncating='pre',value=0.0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4、词表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vocab = tokenizer.word_index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5、label处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "lb = LabelEncoder()\n",
    "train_label = lb.fit_transform(train['class'].values)\n",
    "train_label = to_categorical(train_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6、Dataset数据读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train,X_val,y_train,y_val = train_test_split(train_,train_label,\n",
    "                                               test_size=0.1,random_state=666)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(64)\n",
    "val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(128)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word Embedding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、word2vec模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "add word2vec finished....\n"
     ]
    }
   ],
   "source": [
    "all_data=pd.concat([train['article'],test['article']])\n",
    "file_name = '../embedding/Word2Vec_word_200.model'\n",
    "if not os.path.exists(file_name):\n",
    "    model = Word2Vec([document.split(' ')for document in all_data.values],\n",
    "                     size=200, \n",
    "                     window=5,\n",
    "                     iter=10, \n",
    "                     workers=11, \n",
    "                     seed=2018, \n",
    "                     min_count=2)\n",
    "\n",
    "    model.save(file_name)\n",
    "else:\n",
    "    model = Word2Vec.load(file_name)\n",
    "print(\"add word2vec finished....\")    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2、word embedding构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  \"\"\"\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "\n",
    "embedding_matrix = np.zeros((len(word_vocab) + 1, 200))\n",
    "for word, i in word_vocab.items():\n",
    "    embedding_vector = model.wv[word] if word in model else None\n",
    "    if embedding_vector is not None:\n",
    "        count += 1\n",
    "        embedding_matrix[i] = embedding_vector\n",
    "    else:\n",
    "        unk_vec = np.random.random(200) * 0.5\n",
    "        unk_vec = unk_vec - unk_vec.mean()\n",
    "        embedding_matrix[i] = unk_vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model(sent_length, embeddings_weight,class_num):\n",
    "    content = Input(shape=(sent_length,), dtype='int32')\n",
    "    embedding = Embedding(\n",
    "        name=\"word_embedding\",\n",
    "        input_dim=embeddings_weight.shape[0],\n",
    "        weights=[embeddings_weight],\n",
    "        output_dim=embeddings_weight.shape[1],\n",
    "        trainable=False)\n",
    "\n",
    "    x = SpatialDropout1D(0.2)(embedding(content))\n",
    "\n",
    "    x = Bidirectional(GRU(200, return_sequences=True))(x)\n",
    "    x = Bidirectional(GRU(200, return_sequences=True))(x)\n",
    "\n",
    "    avg_pool = GlobalAveragePooling1D()(x)\n",
    "    max_pool = GlobalMaxPooling1D()(x)\n",
    "\n",
    "    conc = concatenate([avg_pool, max_pool])\n",
    "    \n",
    "    x = Dense(1000)(conc)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Activation(activation=\"relu\")(x)\n",
    "    x = Dropout(0.2)(x)\n",
    "    x = Dense(500)(x)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Activation(activation=\"relu\")(x)\n",
    "    output = Dense(19, activation=\"softmax\")(x)\n",
    "\n",
    "    model = tf.keras.models.Model(inputs=content, outputs=output)\n",
    "    model.compile(loss='categorical_crossentropy', \n",
    "                  optimizer='adam', \n",
    "                  metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = build_model(1800, embedding_matrix,19)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model\"\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "input_1 (InputLayer)            [(None, 1800)]       0                                            \n",
      "__________________________________________________________________________________________________\n",
      "word_embedding (Embedding)      (None, 1800, 200)    254292200   input_1[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "spatial_dropout1d (SpatialDropo (None, 1800, 200)    0           word_embedding[0][0]             \n",
      "__________________________________________________________________________________________________\n",
      "bidirectional (Bidirectional)   (None, 1800, 400)    482400      spatial_dropout1d[0][0]          \n",
      "__________________________________________________________________________________________________\n",
      "bidirectional_1 (Bidirectional) (None, 1800, 400)    722400      bidirectional[0][0]              \n",
      "__________________________________________________________________________________________________\n",
      "global_average_pooling1d (Globa (None, 400)          0           bidirectional_1[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "global_max_pooling1d (GlobalMax (None, 400)          0           bidirectional_1[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "concatenate (Concatenate)       (None, 800)          0           global_average_pooling1d[0][0]   \n",
      "                                                                 global_max_pooling1d[0][0]       \n",
      "__________________________________________________________________________________________________\n",
      "dense (Dense)                   (None, 1000)         801000      concatenate[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "batch_normalization (BatchNorma (None, 1000)         4000        dense[0][0]                      \n",
      "__________________________________________________________________________________________________\n",
      "activation (Activation)         (None, 1000)         0           batch_normalization[0][0]        \n",
      "__________________________________________________________________________________________________\n",
      "dropout (Dropout)               (None, 1000)         0           activation[0][0]                 \n",
      "__________________________________________________________________________________________________\n",
      "dense_1 (Dense)                 (None, 500)          500500      dropout[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "batch_normalization_1 (BatchNor (None, 500)          2000        dense_1[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "activation_1 (Activation)       (None, 500)          0           batch_normalization_1[0][0]      \n",
      "__________________________________________________________________________________________________\n",
      "dense_2 (Dense)                 (None, 19)           9519        activation_1[0][0]               \n",
      "==================================================================================================\n",
      "Total params: 256,814,019\n",
      "Trainable params: 2,518,819\n",
      "Non-trainable params: 254,295,200\n",
      "__________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint,TensorBoard\n",
    "# 检查点保存至的目录\n",
    "checkpoint_dir = './training_checkpoints_bs64'\n",
    "\n",
    "# 检查点的文件名\n",
    "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_{epoch}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "early_stopping = EarlyStopping(monitor='val_accuracy', patience=6)\n",
    "plateau = ReduceLROnPlateau(monitor=\"val_accuracy\", \n",
    "                            verbose=1, \n",
    "                            mode='max', \n",
    "                            factor=0.5, \n",
    "                            patience=3)\n",
    "checkpoint = ModelCheckpoint(checkpoint_prefix, \n",
    "                             monitor='val_accuracy',\n",
    "                             verbose=2, \n",
    "                             save_best_only=True,\n",
    "                             mode='max',\n",
    "                             save_weights_only=True)\n",
    "\n",
    "model.fit(train_ds,\n",
    "          epochs=30,\n",
    "          validation_data=val_ds,\n",
    "          callbacks=[early_stopping, plateau, checkpoint],\n",
    "          verbose=2)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbf68c13b38>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型线下验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "valid's macro_f1: 0.7650360570681052\n"
     ]
    }
   ],
   "source": [
    "valid_prob = model.predict(val_ds)\n",
    "valid_pred = np.argmax(valid_prob,axis=1)\n",
    "valid_pred = lb.inverse_transform(valid_pred)\n",
    "\n",
    "val_true = np.argmax(y_val,axis=1)\n",
    "val_true = lb.inverse_transform(val_true)\n",
    "\n",
    "from sklearn.metrics import f1_score\n",
    "print (\"valid's macro_f1: %s\" % f1_score(val_true,valid_pred,average='macro'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 结果提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ds = tf.data.Dataset.from_tensor_slices((test_,np.zeros((test_.shape[0],19)))).batch(128)\n",
    "test_prob = model.predict(test_ds)\n",
    "test_pred = np.argmax(test_prob,axis=1)\n",
    "test['class'] = lb.inverse_transform(test_pred)\n",
    "test[[\"id\",\"class\"]].to_csv(\"submission_dnn_baseline.csv\",index=False,header=True,encoding='utf-8')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
